Batchnormgrad

批标准化 (Batch Normalization) 反向传播算子

计算批标准化 (Batch Normalization) 的梯度。

该算子计算损失函数 L 对输入 x、缩放因子 scale (γ) 和偏置 bias (β) 的梯度,分别记为 dx、dscale 和 dbias,计算公式如下:

\[\begin{split}dscale(\gamma) &= \sum_{i=1}^{m} dy_i \cdot \hat{x}_i \\ dbias(\beta) &= \sum_{i=1}^{m} dy_i\end{split}\]
\[dx_i = \frac{\gamma}{m\sqrt{\sigma^2 + \epsilon}} \left[ m \cdot dy_i - \sum_{j=1}^{m}dy_j - \hat{x}_i \sum_{j=1}^{m}dy_j \hat{x}_j \right]\]

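其中 \(m\) 是批处理大小 (batch),\(\hat{x}_i = (x_i - \mu) / \sqrt{\sigma^2 + \epsilon}\) 是归一化后的 \(x_i\),\(\mu\) 和 \(\sigma^2\) 分别是前向传播时计算的均值和方差,\(\gamma\) 是缩放因子 scale。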

输入:
  • x - 前向传播时的输入张量。

  • dy - 来自后一层的上游梯度。

  • params - 其他参数打包成数组。

  • core_mask - 核掩码。

输出:
  • dx - 对输入 x 的梯度。

  • dbias - 对偏置 bias (β) 的梯度。

  • dscale - 对缩放因子 scale (γ) 的梯度。

支持平台:

FT78NE MT7004

备注

  • FT78NE 支持fp32

  • MT7004 支持fp16, fp32

参数数组结构:

long long params[12];
params[0] = (long long)mean;      // 前向传播时计算的均值。
params[1] = (long long)invar;     // 前向传播时计算的逆方差 (1 / sqrt(variance + epsilon))。
params[2] = (long long)scale;     // 前向传播时使用的缩放因子 (gamma, γ)。
params[3] = (long long)dbias;     // 对偏置 bias (β) 的梯度。
params[4] = (long long)dscale;    // 对缩放因子 scale (γ) 的梯度。
params[5] = (long long)batch;     // 批处理大小。
params[6] = (long long)channel;   // 通道数。
params[7] = (long long)is_train;  // 是否为训练模式。

共享存储版本:

void fp_batch_norm_grad_s(float *x, float *dy, float *dx, int core_mask)
void hp_batch_norm_grad_s(half *x, half *dy, half *dx, int core_mask)

C调用示例:

 1//FT78NE示例
 2#include <stdio.h>
 3#include <batchnormgrad.h>
 4int main(int argc, char* argv[]) {
 5    float *x = (float *)0xA0000000;          // forward input x
 6    float *dy = (float *)0xB0000000;         // upstream gradient dy
 7    float *mean = (float *)0xC0000000;       // forward mean
 8    float *invar = (float *)0xD0000000;      // forward inverse variance
 9    float *scale = (float *)0xE0000000;      // forward scale (gamma)
10
11    float *dx = (float *)0xA1000000;         // output gradient dx
12    float *dbias = (float *)0xB1000000;      // output gradient dbias
13    float *dscale = (float *)0xC1000000;     // output gradient dscale
14
15    int batch = 4;
16    int channel = 64;
17    int is_train = true;
18    int core_mask = 0xff;
19
20    long long params[12];
21    params[0] = (long long)mean;
22    params[1] = (long long)invar;
23    params[2] = (long long)scale;
24    params[3] = (long long)dbias;
25    params[4] = (long long)dscale;
26    params[5] = (long long)batch;
27    params[6] = (long long)channel;
28    params[7] = (long long)is_train;
29    fp_batch_norm_grad_s(x, dy, dx, core_mask);
30    return 0;
31}

私有存储版本:

void fp_batch_norm_grad_p(float *x, float *dy, float *dx, long long *params)
void hp_batch_norm_grad_p(half *x, half *dy, half *dx, long long *params)

C调用示例:

 1//FT78NE示例
 2#include <stdio.h>
 3#include <batchnormgrad.h>
 4int main(int argc, char* argv[]) {
 5    float *x = (float *)0x10000000;          // forward input x in L2 space
 6    float *dy = (float *)0x10100000;         // upstream gradient dy
 7    float *mean = (float *)0x10200000;       // forward mean
 8    float *invar = (float *)0x10300000;      // forward inverse variance
 9    float *scale = (float *)0x10400000;      // forward scale (gamma)
10
11    float *dx = (float *)0x10500000;         // output gradient dx
12    float *dbias = (float *)0x10600000;      // output gradient dbias
13    float *dscale = (float *)0x10700000;     // output gradient dscale
14
15    int batch = 4;
16    int channel = 32;
17    int is_train = true;
18
19    long long params[12];
20    params[0] = (long long)mean;
21    params[1] = (long long)invar;
22    params[2] = (long long)scale;
23    params[3] = (long long)dbias;
24    params[4] = (long long)dscale;
25    params[5] = (long long)batch;
26    params[6] = (long long)channel;
27    params[7] = (long long)is_train;
28
29    fp_batch_norm_grad_p(x, dy, dx, params);
30    return 0;
31}